
# coding: utf-8

# In[13]:


import json
import os
import time

import requests
from bs4 import BeautifulSoup
from lxml import etree
from pyquery import PyQuery
from requests.exceptions import RequestException
# Switch to the author's working directory so relative paths resolve there.
# A raw string keeps the backslash literal; "\p" in a normal string is an
# invalid escape sequence that newer Python versions warn about.
os.chdir(r"E:\pythonstudy")


# In[31]:


def getpage(url, timeout=10):
    """Fetch *url* with HTTP GET and return the response text, or ``None``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as text when the status code is 200. ``None`` for
        any other status code, or when ``requests`` raises a
        ``RequestException`` (which includes timeouts).
    """
    # A browser-like User-Agent header is sent with the request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    try:
        # Without an explicit timeout, requests can wait indefinitely on a
        # server that accepts the connection but never answers.
        res = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    # Only a 200 response counts as a usable page.
    return res.text if res.status_code == 200 else None


# In[116]:


def parsepage(content):
    """Lazily yield one dict per ``<tr class="item">`` row found in *content*.

    Args:
        content: HTML markup to parse.

    Yields:
        A dict with two keys:
        ``'title'`` - result of selecting the ``title`` attribute of the
        links inside ``div.pl2`` of the row;
        ``'score'`` - result of selecting the text of ``span.rating_nums``
        inside the row.
        Both values are the raw XPath results (lxml returns these as lists),
        not single strings.
    """
    # Parse with lxml; the returned root element is queried via XPath below.
    root = etree.HTML(content)

    # Queries are relative to each row, hence the leading ".".
    title_query = ".//div[@class='pl2']/a/@title"
    score_query = ".//span[@class='rating_nums']/text()"

    # Walk every item row and extract its fields.
    for row in root.xpath("//tr[@class='item']"):
        titles = row.xpath(title_query)
        scores = row.xpath(score_query)
        yield {'title': titles, 'score': scores}


# In[117]:


def writefile(content):
    """Append *content* as one JSON line to ``./result.txt``.

    Args:
        content: JSON-serialisable object (for example a parsed item dict).

    Raises:
        TypeError: If *content* cannot be serialised by ``json.dumps``.
    """
    # Serialise before opening the file so a failure here never creates or
    # touches result.txt. ensure_ascii=False keeps non-ASCII text readable
    # in the output instead of \uXXXX escapes.
    line = json.dumps(content, ensure_ascii=False) + "\n"
    with open("./result.txt", 'a', encoding="utf-8") as f:
        f.write(line)


# In[118]:


def main(offset):
    """Fetch one page of the Douban Books Top 250 list and print its items.

    Args:
        offset: Value appended to the ``start`` query parameter of the list
            URL (presumably the index of the first entry on the page).
    """
    url = 'https://book.douban.com/top250?start=' + str(offset)
    html = getpage(url)  # perform the download
    if not html:
        # Nothing to parse: getpage returned None or an empty body.
        return
    # parsepage is a generator, so it only does work while being iterated;
    # calling it without consuming the result would be a no-op.
    for item in parsepage(html):
        print(item)


# In[ ]:


if __name__ == '__main__':
    # Ten pages at offsets 0, 25, ..., 225; each offset is requested exactly
    # once.
    for offset in range(0, 250, 25):
        main(offset=offset)
        # Pause for a second between consecutive requests.
        time.sleep(1)

